<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="zh-Hans" xml:lang="zh-Hans"><head>

<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.198">

<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">


<title>R语言医学数据科学入门 - 1&nbsp; Hello data</title>
<style>
/* Let long lines inside code elements wrap while keeping their whitespace. */
code {
  white-space: pre-wrap;
}

/* Spans flagged as small caps use the small-caps font variant. */
span.smallcaps {
  font-variant: small-caps;
}

/* Column containers are flex boxes; the gap scales with the viewport width
   and is capped at 1.5em. */
div.columns {
  display: flex;
  gap: min(4vw, 1.5em);
}

/* Each column grows/shrinks as needed and scrolls horizontally on overflow. */
div.column {
  flex: auto;
  overflow-x: auto;
}

/* Hanging indent: the first line is pulled back by the same 1.5em that the
   block is pushed in. */
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}

/* Task lists drop the list marker; the checkbox is shifted left with a
   negative margin. */
ul.task-list {
  list-style: none;
}

ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1.6em;
  vertical-align: middle;
}
</style>


<script src="site_libs/quarto-nav/quarto-nav.js"></script>
<script src="site_libs/quarto-nav/headroom.min.js"></script>
<script src="site_libs/clipboard/clipboard.min.js"></script>
<script src="site_libs/quarto-search/autocomplete.umd.js"></script>
<script src="site_libs/quarto-search/fuse.min.js"></script>
<script src="site_libs/quarto-search/quarto-search.js"></script>
<meta name="quarto:offset" content="./">
<link href="./02-Data types.html" rel="next">
<link href="./intro.html" rel="prev">
<script src="site_libs/quarto-html/quarto.js"></script>
<script src="site_libs/quarto-html/popper.min.js"></script>
<script src="site_libs/quarto-html/tippy.umd.min.js"></script>
<script src="site_libs/quarto-html/anchor.min.js"></script>
<link href="site_libs/quarto-html/tippy.css" rel="stylesheet">
<link href="site_libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="site_libs/bootstrap/bootstrap.min.js"></script>
<link href="site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="site_libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script id="quarto-search-options" type="application/json">{
  "location": "sidebar",
  "copy-button": false,
  "collapse-after": 3,
  "panel-placement": "start",
  "type": "textbox",
  "limit": 20,
  "language": {
    "search-no-results-text": "No results",
    "search-matching-documents-text": "matching documents",
    "search-copy-link-title": "Copy link to search",
    "search-hide-matches-text": "Hide additional matches",
    "search-more-match-text": "more match in this document",
    "search-more-matches-text": "more matches in this document",
    "search-clear-button-title": "Clear",
    "search-detached-cancel-button-title": "Cancel",
    "search-submit-button-title": "Submit"
  }
}</script>

<link href="site_libs/pagedtable-1.1/css/pagedtable.css" rel="stylesheet">
<script src="site_libs/pagedtable-1.1/js/pagedtable.js"></script>


</head>

<body class="nav-sidebar floating">

<div id="quarto-search-results"></div>
  <header id="quarto-header" class="headroom fixed-top">
  <nav class="quarto-secondary-nav" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
    <div class="container-fluid d-flex justify-content-between">
      <h1 class="quarto-secondary-nav-title"><span class="chapter-number">1</span>&nbsp; <span class="chapter-title">Hello data</span></h1>
      <button type="button" class="quarto-btn-toggle btn" aria-label="Show secondary navigation">
        <i class="bi bi-chevron-right"></i>
      </button>
    </div>
  </nav>
</header>
<!-- content -->
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article">
<!-- sidebar -->
  <nav id="quarto-sidebar" class="sidebar collapse sidebar-navigation floating overflow-auto">
    <div class="pt-lg-2 mt-2 text-left sidebar-header">
    <div class="sidebar-title mb-0 py-0">
      <a href="./">R语言医学数据科学入门</a> 
        <div class="sidebar-tools-main">
    <a href="" title="Share" id="sidebar-tool-dropdown-0" class="sidebar-tool dropdown-toggle px-1" data-bs-toggle="dropdown" aria-expanded="false"><i class="bi bi-share"></i></a>
    <ul class="dropdown-menu" aria-labelledby="sidebar-tool-dropdown-0">
        <li>
          <a class="dropdown-item sidebar-tools-main-item" href="https://twitter.com/intent/tweet?url=|url|">
            <i class="bi bi-twitter pe-1"></i>
          Twitter
          </a>
        </li>
        <li>
          <a class="dropdown-item sidebar-tools-main-item" href="https://www.facebook.com/sharer/sharer.php?u=|url|">
            <i class="bi bi-facebook pe-1"></i>
          Facebook
          </a>
        </li>
    </ul>
</div>
    </div>
      </div>
      <div class="mt-2 flex-shrink-0 align-items-center">
        <div class="sidebar-search">
        <div id="quarto-search" class="" title="Search"></div>
        </div>
      </div>
    <div class="sidebar-menu-container"> 
    <ul class="list-unstyled mt-1">
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./index.html" class="sidebar-item-text sidebar-link">Preface</a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./intro.html" class="sidebar-item-text sidebar-link">Introduction</a>
  </div>
</li>
        <li class="sidebar-item sidebar-item-section">
      <div class="sidebar-item-container"> 
            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" aria-expanded="true">Data things</a>
          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" aria-expanded="true">
            <i class="bi bi-chevron-right ms-2"></i>
          </a> 
      </div>
      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./01-hello_data.html" class="sidebar-item-text sidebar-link active"><span class="chapter-number">1</span>&nbsp; <span class="chapter-title">Hello data</span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./02-Data types.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">2</span>&nbsp; <span class="chapter-title">数据结构和类型</span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./03-tidy_data.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">3</span>&nbsp; <span class="chapter-title">Tidy data</span></a>
  </div>
</li>
          <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./04-data IO.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">4</span>&nbsp; <span class="chapter-title">Data I/O</span></a>
  </div>
</li>
      </ul>
  </li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./tips.html" class="sidebar-item-text sidebar-link">Tips</a>
  </div>
</li>
        <li class="sidebar-item">
  <div class="sidebar-item-container"> 
  <a href="./references.html" class="sidebar-item-text sidebar-link">References</a>
  </div>
</li>
    </ul>
    </div>
</nav>
<!-- margin-sidebar -->
    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
        <nav id="TOC" role="doc-toc" class="toc-active">
    <h2 id="toc-title">Table of contents</h2>
   
  <ul>
  <li><a href="#什么是数据" id="toc-什么是数据" class="nav-link active" data-scroll-target="#什么是数据"><span class="toc-section-number">1.1</span>  什么是数据</a></li>
  <li><a href="#数据的类型" id="toc-数据的类型" class="nav-link" data-scroll-target="#数据的类型"><span class="toc-section-number">1.2</span>  数据的类型</a>
  <ul class="collapse">
  <li><a href="#文本数据" id="toc-文本数据" class="nav-link" data-scroll-target="#文本数据"><span class="toc-section-number">1.2.1</span>  文本数据</a></li>
  <li><a href="#其他数据" id="toc-其他数据" class="nav-link" data-scroll-target="#其他数据"><span class="toc-section-number">1.2.2</span>  其他数据</a></li>
  </ul></li>
  </ul>
</nav>
    </div>
<!-- main -->
<main class="content" id="quarto-document-content">

<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title d-none d-lg-block"><span class="chapter-number">1</span>&nbsp; <span class="chapter-title">Hello data</span></h1>
</div>



<div class="quarto-title-meta">

    
  
    
  </div>
  

</header>

<div class="cell">

</div>
<p>这部分我们首先要了解的是数据类型和整洁数据（tidy data）。</p>
<section id="什么是数据" class="level2" data-number="1.1">
<h2 data-number="1.1" class="anchored" data-anchor-id="什么是数据"><span class="header-section-number">1.1</span> 什么是数据</h2>
<p>我的理解，凡是能够储存在存储介质里的都是数据。不同的计算机从业者，看待数据的角度是不同的。如数据修复，更看重数据的底层，必须知道数据如何存储，如何在winhex那一串的数字中寻找答案；操作系统，更关心的是数据存储分配，存储共享，存储保护，存储扩张等。</p>
<p>在这个教程（书）中，我们关心的是具备一定<strong>规范、规律</strong>，且具备<strong>分析价值</strong>的数据。我们要学会导入、清洗、分析、利用好这样的数据，这也就是所谓的“数据科学（data science）”。</p>
</section>
<section id="数据的类型" class="level2" data-number="1.2">
<h2 data-number="1.2" class="anchored" data-anchor-id="数据的类型"><span class="header-section-number">1.2</span> 数据的类型</h2>
<p>数据科学家（暂且用这么高端的名字吧）认为具备分析价值的数据是什么样的呢？我想到了以下数据类型，当然有些数据是原始状态，在正式进入分析以前，是需要先做预处理的（Pre-processing）。下面我列举一些常见的数据类型。</p>
<section id="文本数据" class="level3" data-number="1.2.1">
<h3 data-number="1.2.1" class="anchored" data-anchor-id="文本数据"><span class="header-section-number">1.2.1</span> 文本数据</h3>
<p>在这里，我把文本形式存储的数据全部叫文本数据。这些数据，有些是按照事先约定好的形式和格式采集和录入而形成的表格型数据（像Excel），也有普通的一本书、一篇文章这样的纯粹文字数据。其中，表格型数据（dataframe）是我们使用最多，最需要掌握的数据类型。</p>
<div class="callout-note callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p>这是我个人对文本数据的定义，其他专家可能有不同的归类或理解。</p>
</div>
</div>
<ol type="1">
<li>数据框（dataframe）</li>
</ol>
<p><code>dataframe</code> 顾名思义，也叫方形数据（Rectangular Data）。Python（pandas包）和R都将这种数据称为dataframe，也是最常见的数据对象。 在R中，还有<code>tibble</code>和<code>data.table</code>这类<code>dataframe</code>的变种，本质上还是方形数据，只不过用来处理的包和风格不太相同。</p>
<p>这就是一个<code>dataframe</code>例子</p>
<div class="cell">
<div class="cell-output-display">
<table class="table table-sm table-striped">
<thead>
<tr class="header">
<th style="text-align: left;">agegp</th>
<th style="text-align: left;">alcgp</th>
<th style="text-align: left;">tobgp</th>
<th style="text-align: right;">ncases</th>
<th style="text-align: right;">ncontrols</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">0-39g/day</td>
<td style="text-align: left;">0-9g/day</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">40</td>
</tr>
<tr class="even">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">0-39g/day</td>
<td style="text-align: left;">10-19</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">10</td>
</tr>
<tr class="odd">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">0-39g/day</td>
<td style="text-align: left;">20-29</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">6</td>
</tr>
<tr class="even">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">0-39g/day</td>
<td style="text-align: left;">30+</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">5</td>
</tr>
<tr class="odd">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">40-79</td>
<td style="text-align: left;">0-9g/day</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">27</td>
</tr>
<tr class="even">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">40-79</td>
<td style="text-align: left;">10-19</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">7</td>
</tr>
<tr class="odd">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">40-79</td>
<td style="text-align: left;">20-29</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">4</td>
</tr>
<tr class="even">
<td style="text-align: left;">25-34</td>
<td style="text-align: left;">40-79</td>
<td style="text-align: left;">30+</td>
<td style="text-align: right;">0</td>
<td style="text-align: right;">7</td>
</tr>
</tbody>
</table>
</div>
</div>
<div class="callout-tip callout callout-style-default callout-captioned">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-caption-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>我们后面还会详细讲解Rectangular Data，尤其是tidy data作为一种规范的dataframe，请稍安勿躁。</p>
</div>
</div>
<ol start="2" type="1">
<li>纯粹文本Plain text</li>
</ol>
<p>纯文本，因为它纯了，就是普通不加任何清洗和修改的自然文字段落。</p>
</section>
<section id="其他数据" class="level3" data-number="1.2.2">
<h3 data-number="1.2.2" class="anchored" data-anchor-id="其他数据"><span class="header-section-number">1.2.2</span> 其他数据</h3>
<ul>
<li><p>时间序列Time series data</p>
<p>大部分信号处理都是这类数据，比如心电图、股票、声波。</p></li>
<li><p>空间数据Spatial data</p>
<p>地图、导航类似的数据。</p></li>
<li><p>图（网络）数据Graph (or network) data</p>
<blockquote class="blockquote">
<p>In computer science and information technology, the term graph typically refers to a depiction of the connections among entities, and to the underlying data structure.在计算机科学和信息技术中，术语“图”通常指实体之间的连接和底层数据结构的描述。如图神经网络。</p>
</blockquote></li>
<li><p>图像和视频数据image data and video data</p></li>
</ul>


</section>
</section>

</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  // Mirror the stylesheet's `data-mode` attribute onto <body>: "dark" yields
  // the `quarto-dark` class, any other value yields `quarto-light`. The
  // opposite class is removed so only one of the two is present.
  const toggleBodyColorMode = (bsSheetEl) => {
    const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
    const bodyClasses = window.document.querySelector("body").classList;
    bodyClasses.add(isDark ? "quarto-dark" : "quarto-light");
    bodyClasses.remove(isDark ? "quarto-light" : "quarto-dark");
  };
  // Sync the <body> colour-mode class with the Bootstrap stylesheet link
  // (`link#quarto-bootstrap`). Does nothing when that link is absent.
  const toggleBodyColorPrimary = () => {
    const primarySheet = window.document.querySelector("link#quarto-bootstrap");
    if (!primarySheet) {
      return;
    }
    toggleBodyColorMode(primarySheet);
  };
  // Run once as soon as the DOM is ready.
  toggleBodyColorPrimary();
  // Add AnchorJS links to every element with the `anchored` class, using the
  // glyph below as the link icon and placing it on the right.
  const anchorIcon = "";
  const anchors = new window.AnchorJS();
  anchors.options = { placement: 'right', icon: anchorIcon };
  anchors.add('.anchored');
  // Code-copy buttons: ClipboardJS copies the element that immediately
  // precedes the clicked button.
  const copier = new window.ClipboardJS('.code-copy-button', {
    target: (trigger) => trigger.previousElementSibling
  });
  // After a successful copy, show a "Copied!" state for one second and then
  // restore the button to how it was.
  copier.on('success', (evt) => {
    const copyBtn = evt.trigger;
    // Release focus so the button does not stay highlighted.
    copyBtn.blur();
    // Switch to the "checked" look and swap the title text.
    copyBtn.classList.add('code-copy-button-checked');
    const previousTitle = copyBtn.getAttribute("title");
    copyBtn.setAttribute("title", "Copied!");
    // A Bootstrap tooltip is shown only when Bootstrap is loaded.
    let tip;
    if (window.bootstrap) {
      copyBtn.setAttribute("data-bs-toggle", "tooltip");
      copyBtn.setAttribute("data-bs-placement", "left");
      copyBtn.setAttribute("data-bs-title", "Copied!");
      tip = new bootstrap.Tooltip(copyBtn, {
        trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]
      });
      tip.show();
    }
    setTimeout(() => {
      if (tip) {
        tip.hide();
        copyBtn.removeAttribute("data-bs-title");
        copyBtn.removeAttribute("data-bs-toggle");
        copyBtn.removeAttribute("data-bs-placement");
      }
      copyBtn.setAttribute("title", previousTitle);
      copyBtn.classList.remove('code-copy-button-checked');
    }, 1000);
    // Drop the selection ClipboardJS made while copying.
    evt.clearSelection();
  });
  // Attach a tippy popup to `el`. `contentFn` is handed to tippy as the
  // `content` option; the popup is appended to the parent of the element
  // tippy passes to `appendTo`.
  function tippyHover(el, contentFn) {
    window.tippy(el, {
      allowHTML: true,
      content: contentFn,
      maxWidth: 500,
      delay: 100,
      arrow: false,
      appendTo: (reference) => reference.parentElement,
      interactive: true,
      interactiveBorder: 10,
      theme: 'quarto',
      placement: 'bottom-start'
    });
  }
  // Footnote references: hovering one shows the HTML of the footnote it
  // points at. A reference with no usable target, or whose target id is not
  // in the document, produces empty content instead of throwing inside the
  // popup callback.
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (var i=0; i<noterefs.length; i++) {
    const ref = noterefs[i];
    tippyHover(ref, function() {
      // Prefer the explicit data attribute, falling back to the link target.
      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
      if (!href) {
        // Neither attribute is set, so there is nothing to look up.
        return "";
      }
      // An absolute URL is reduced to its fragment. `new URL` throws for a
      // relative value such as "#fn1"; that value is deliberately used as-is.
      try { href = new URL(href).hash; } catch {}
      // Strip the leading "#" (and an optional "/") to get the element id.
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      // getElementById yields null when no element has that id.
      return note ? note.innerHTML : "";
    });
  }
  // Walk up from `el` until an ancestor carrying a `data-cites` attribute is
  // found. Returns `{ el, cites }`, where `el` is the child of that ancestor
  // on the path walked and `cites` is the attribute split on spaces; returns
  // undefined when no ancestor has the attribute.
  const findCites = (el) => {
    let current = el;
    while (current.parentElement) {
      const citeList = current.parentElement.dataset.cites;
      if (citeList) {
        return {
          el: current,
          cites: citeList.split(' ')
        };
      }
      current = current.parentElement;
    }
    return undefined;
  };
  // Bibliography references: hovering one shows the bibliography entries
  // listed in the `data-cites` attribute of an enclosing element. Links with
  // no such ancestor get no popup.
  const biblioLinks = window.document.querySelectorAll('a[role="doc-biblioref"]');
  for (const link of biblioLinks) {
    const citeInfo = findCites(link);
    if (!citeInfo) {
      continue;
    }
    tippyHover(citeInfo.el, () => {
      // Build the popup off-document; only its markup is returned.
      const popup = window.document.createElement('div');
      for (const cite of citeInfo.cites) {
        const entry = window.document.createElement('div');
        entry.classList.add('hanging-indent', 'csl-entry');
        // Entries are looked up by the id "ref-<cite key>"; a missing entry
        // leaves an empty placeholder div.
        const source = window.document.getElementById('ref-' + cite);
        if (source) {
          entry.innerHTML = source.innerHTML;
        }
        popup.appendChild(entry);
      }
      return popup.innerHTML;
    });
  }
});
</script>
<nav class="page-navigation">
  <div class="nav-page nav-page-previous">
      <a href="./intro.html" class="pagination-link">
        <i class="bi bi-arrow-left-short"></i> <span class="nav-page-text">Introduction</span>
      </a>          
  </div>
  <div class="nav-page nav-page-next">
      <a href="./02-Data types.html" class="pagination-link">
        <span class="nav-page-text"><span class="chapter-number">2</span>&nbsp; <span class="chapter-title">数据结构和类型</span></span> <i class="bi bi-arrow-right-short"></i>
      </a>
  </div>
</nav>
</div> <!-- /content -->



</body></html>